{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Perceptron Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "data_web_address = \"https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data\"\n",
    "\n",
    "column_names = ['pregnancy_x',\n",
    "'plasma_con',\n",
    "'blood_pressure',\n",
    "'skin_mm',\n",
    "'insulin',\n",
    "'bmi',\n",
    "'pedigree_func',\n",
    "'age',\n",
    "'target']\n",
    "\n",
    "feature_names = column_names[:-1]\n",
    "\n",
    "all_data = pd.read_csv(data_web_address , names=column_names) \n",
    "\n",
    "X = all_data[feature_names]\n",
    "y = all_data['target']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler \n",
    "\n",
    "scaler = StandardScaler()\n",
    "scaler.fit(X_train) \n",
    "X_train_scaled = scaler.transform(X_train)\n",
    "X_test_scaled = scaler.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\jdavila\\Anaconda27\\lib\\site-packages\\sklearn\\linear_model\\stochastic_gradient.py:84: FutureWarning: max_iter and tol parameters have been added in <class 'sklearn.linear_model.perceptron.Perceptron'> in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.\n",
      "  \"and default tol will be 1e-3.\" % type(self), FutureWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,\n",
       "      max_iter=5, n_iter=None, n_jobs=1, penalty=None, random_state=0,\n",
       "      shuffle=True, tol=None, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import Perceptron\n",
    "\n",
    "pr = Perceptron()\n",
    "pr.fit(X_train_scaled, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.72861097734760383"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import cross_val_score, StratifiedKFold\n",
    "\n",
    "skf = StratifiedKFold(n_splits=3)\n",
    "cross_val_score(pr, X_train_scaled, y_train, cv=skf,scoring='roc_auc').mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Classification accuracy :  0.720779220779\n",
      "ROC-AUC Score :  0.61037037037\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import accuracy_score, roc_auc_score\n",
    "\n",
    "print \"Classification accuracy : \", accuracy_score(y_test, pr.predict(X_test_scaled))\n",
    "print \"ROC-AUC Score : \",roc_auc_score(y_test, pr.predict(X_test_scaled))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#scikit-learn version 0.18.1\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "param_dist = {'alpha': [0.1,0.01,0.001,0.0001], \n",
    " 'penalty': [None, 'l2','l1','elasticnet'],\n",
    " 'random_state': [7],\n",
    " 'class_weight':['balanced',None],'eta0': [0.25,0.5,0.75,1.0], \n",
    " 'warm_start':[True,False], 'n_iter':[50]}\n",
    "\n",
    "gs_perceptron = GridSearchCV(pr, param_dist, scoring='roc_auc',cv=skf).fit(X_train_scaled, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#scikit-learn version 0.19.0\n",
    "#used this code in the book: the rest of the book code will work with both scikit 0.8.1 or 0.19.0\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "param_dist = {'alpha': [0.1,0.01,0.001,0.0001], \n",
    " 'penalty': [None, 'l2','l1','elasticnet'],\n",
    " 'random_state': [7],\n",
    " 'class_weight':['balanced',None],'eta0': [0.25,0.5,0.75,1.0], \n",
    " 'warm_start':[True,False], 'max_iter':[50], 'tol':[1e-3]}\n",
    "\n",
    "gs_perceptron = GridSearchCV(pr, param_dist, scoring='roc_auc',cv=skf).fit(X_train_scaled, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'alpha': 0.001,\n",
       " 'class_weight': None,\n",
       " 'eta0': 0.5,\n",
       " 'max_iter': 50,\n",
       " 'penalty': 'l2',\n",
       " 'random_state': 7,\n",
       " 'tol': 0.001,\n",
       " 'warm_start': True}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gs_perceptron.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.79221656570311072"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gs_perceptron.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Perceptron(alpha=0.001, class_weight=None, eta0=0.5, fit_intercept=True,\n",
       "      max_iter=50, n_iter=None, n_jobs=1, penalty='l2', random_state=7,\n",
       "      shuffle=True, tol=0.001, verbose=0, warm_start=True)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "best_perceptron = gs_perceptron.best_estimator_\n",
    "best_perceptron"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import BaggingClassifier \n",
    "param_dist = {\n",
    " 'max_samples': [0.5,1.0],\n",
    " 'max_features' : [0.5,1.0],\n",
    " 'oob_score' : [True, False],\n",
    " 'n_estimators': [100],\n",
    " 'n_jobs':[-1],\n",
    " 'base_estimator__alpha': [0.001,0.002],\n",
    " 'base_estimator__penalty': [None, 'l2','l1','elasticnet'], }\n",
    "\n",
    "ensemble_estimator = BaggingClassifier(base_estimator = best_perceptron)\n",
    "bag_perceptrons = GridSearchCV(ensemble_estimator, param_dist,scoring='roc_auc',cv=skf,n_jobs=-1).fit(X_train_scaled, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.83299842529587864"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bag_perceptrons.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'base_estimator__alpha': 0.001,\n",
       " 'base_estimator__penalty': 'l1',\n",
       " 'max_features': 1.0,\n",
       " 'max_samples': 1.0,\n",
       " 'n_estimators': 100,\n",
       " 'n_jobs': -1,\n",
       " 'oob_score': True}"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bag_perceptrons.best_params_"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
